1.Over - Under Analysis
oo = readRDS("/Users/ilkerkurtulus/Documents/cse-master/ie582/data/hw1/df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds")
oo = data.table(oo)
oo$date = anytime(oo$date)
mm = readRDS("/Users/ilkerkurtulus/Documents/cse-master/ie582/data/hw1/df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds")
mm = data.table(mm)
mm[, `:=`(c("leagueId", "type"), NULL)]
mm$date = anytime(mm$date)
mm = mm[is.na(mm$score) == FALSE]
mm = mm[, `:=`(c("home_goal", "away_goal"), tstrsplit(score,
":", fixed = TRUE))]
mm$home_goal = as.numeric(mm$home_goal)
mm$away_goal = as.numeric(mm$away_goal)
mm[, `:=`(total_goals, home_goal + away_goal)]
mm = mutate(mm, is_over = ifelse(total_goals > 2.5, 1, 0))
mm = data.table(mm)
head(mm)
## matchId home away score date
## 1: KjF6FiA6 tottenham manchester city 0:0 2010-08-14 15:45:00
## 2: ILVbJgQm aston villa west ham 3:0 2010-08-14 18:00:00
## 3: SGIEDVvJ wolves stoke city 2:1 2010-08-14 18:00:00
## 4: YwL5xFHJ bolton fulham 0:0 2010-08-14 18:00:00
## 5: lQJAEBPC wigan blackpool 0:4 2010-08-14 18:00:00
## 6: byRcHDuf sunderland birmingham 2:2 2010-08-14 18:00:00
## home_goal away_goal total_goals is_over
## 1: 0 0 0 0
## 2: 3 0 3 1
## 3: 2 1 3 1
## 4: 0 0 0 0
## 5: 0 4 4 1
## 6: 2 2 4 1
Different handicap values mean different events with different probability. So we should not considered them together. Thats why from “ah” betType i will pick totalhandicap = 0 and for “ou” betType lets pick 2.5
While choosing bookmakers we need to care that bookmakers should provide odds of above handicap and bettype. To do that lets print them and choose:
oo[(betType == "ou") & (totalhandicap == "2.5"), .N, by = bookmaker]
## bookmaker N
## 1: 1xBet 10187
## 2: bet-at-home 11838
## 3: bet365 13809
## 4: Betclic 12011
## 5: BetVictor 14063
## 6: Betway 11041
## 7: bwin 11528
## 8: Expekt 9664
## 9: Paddy Power 12492
## 10: Unibet 10905
## 11: William Hill 5494
## 12: youwin 11280
## 13: Betsafe 14853
## 14: Betsson 15276
## 15: Sportingbet 10672
## 16: Tipico 10046
## 17: Pinnacle 14610
## 18: 10Bet 36443
## 19: 12BET 11148
## 20: 188BET 12306
## 21: ComeOn 11497
## 22: SBOBET 10418
## 23: Interwetten 7279
## 24: 888sport 7319
## 25: Betfair 6219
## 26: Betfair Exchange 14914
## bookmaker N
oo[(betType == "ah") & (totalhandicap == "0"), .N, by = bookmaker]
## bookmaker N
## 1: 1xBet 10584
## 2: bet365 14846
## 3: Interwetten 7285
## 4: Unibet 5283
## 5: BetVictor 6863
## 6: Paddy Power 4972
## 7: Pinnacle 9853
## 8: Betfair Exchange 12309
## 9: 10Bet 9758
## 10: 188BET 6847
## 11: ComeOn 5745
## 12: 12BET 5796
## 13: SBOBET 5568
## 14: Betsson 322
## 15: Betsafe 44
## 16: youwin 60
## 17: Expekt 30
So we can select 5 bookmakers as 1xBet, bet365, Betfair Exchange, Pinnacle and 10Bet
bookmakers = c("1xBet", "bet365", "Betfair Exchange", "Pinnacle",
"10Bet")
func_1a = function(n_bm) {
df = oo[bookmaker == bookmakers[n_bm]]
pdf_1 = dcast(df[(betType != "ou") & (betType != "ah")],
matchId + bookmaker ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
# only choose ah = 0 due to different handicap means
# different odds, so its not useful to mix different
# handicaps
x = df[(betType == "ah") & (totalhandicap == "0")]
pdf_2 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
# only choose ou = 2.5 due to different handicap means
# different odds, so its not useful to mix different
# handicaps
x = df[(betType == "ou") & (totalhandicap == "2.5")]
pdf_3 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
pdf = na.omit(pdf_1[pdf_2, on = "matchId"][pdf_3, on = "matchId"])
all_df = na.omit(pdf[mm[, c("matchId", "is_over")], on = "matchId"])
all_df = all_df[, `:=`(is_over, as.character(is_over))]
scaled_df = scale(all_df[, 3:9])
pca = prcomp(scaled_df, center = TRUE, scale. = TRUE)
print(summary(pca))
eigs = pca$sdev^2
exp_var_ratio = eigs/sum(eigs)
cum_exp_var_ratio = cumsum(exp_var_ratio)
plot(cum_exp_var_ratio, type = "l", xlab = "# of Principle Components",
ylab = "Cumulative Explained Variance")
title(paste("Cumulative Explained Variance Ratio of PCA for ",
bookmakers[n_bm], sep = ""))
all_pca = predict(pca, newdata = scaled_df)
all_pca2d = all_pca[, 1:2]
all_pca2d = data.table(all_pca2d)
all_pca2d[, `:=`(is_over, all_df$is_over)]
ggplot(all_pca2d, aes(x = PC1, y = PC2, color = is_over)) +
geom_point() + ggtitle(paste("Transformed Data with p = 2 PCA and is_over results",
bookmakers[n_bm], sep = " "))
}
func_1a(1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8620 1.4220 1.1528 0.31994 0.25363 0.09678
## Proportion of Variance 0.4953 0.2889 0.1898 0.01462 0.00919 0.00134
## Cumulative Proportion 0.4953 0.7842 0.9740 0.98863 0.99782 0.99916
## PC7
## Standard deviation 0.07657
## Proportion of Variance 0.00084
## Cumulative Proportion 1.00000
func_1a(2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7096 1.5209 1.2876 0.21767 0.20100 0.12273
## Proportion of Variance 0.4175 0.3304 0.2369 0.00677 0.00577 0.00215
## Cumulative Proportion 0.4175 0.7480 0.9848 0.99161 0.99738 0.99953
## PC7
## Standard deviation 0.05739
## Proportion of Variance 0.00047
## Cumulative Proportion 1.00000
func_1a(3)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7306 1.3078 1.2174 0.65847 0.53327 0.27811
## Proportion of Variance 0.4279 0.2443 0.2117 0.06194 0.04063 0.01105
## Cumulative Proportion 0.4279 0.6722 0.8839 0.94584 0.98647 0.99752
## PC7
## Standard deviation 0.13175
## Proportion of Variance 0.00248
## Cumulative Proportion 1.00000
func_1a(4)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8741 1.6091 0.8743 0.28487 0.16779 0.12440
## Proportion of Variance 0.5018 0.3699 0.1092 0.01159 0.00402 0.00221
## Cumulative Proportion 0.5018 0.8717 0.9809 0.99246 0.99648 0.99869
## PC7
## Standard deviation 0.09562
## Proportion of Variance 0.00131
## Cumulative Proportion 1.00000
func_1a(5)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.9234 1.5186 0.71947 0.61145 0.23300 0.17029
## Proportion of Variance 0.5285 0.3295 0.07395 0.05341 0.00776 0.00414
## Cumulative Proportion 0.5285 0.8579 0.93188 0.98529 0.99305 0.99719
## PC7
## Standard deviation 0.14018
## Proportion of Variance 0.00281
## Cumulative Proportion 1.00000
Using first 2 components is easy to visualize however in terms of explaining variance of the data its not beneficial. Yet in our case except bet365 all datatables has higher than 0.70 explained variance ratio so 2d is a good choice to visualization.
When we look at the scatter plots we cant find a distinction between over/under result. This is caused by selection of p as 2 in PCA and behavior of the data (ie nonlinearity) is not suitable for PCA.
Calculation of manhattan and euclidian distances as well as 2D and 3D of them with MDS:
bookmakers = c("1xBet", "bet365", "Betfair Exchange", "Pinnacle",
"10Bet")
func_1bman = function(n_bm) {
df = oo[bookmaker == bookmakers[n_bm]]
pdf_1 = dcast(df[(betType != "ou") & (betType != "ah")],
matchId + bookmaker ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
x = df[(betType == "ah") & (totalhandicap == "0")]
pdf_2 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
# only choose ou = 2.5 due to different handicap means
# different odds, so its not useful to mix different
# handicaps
x = df[(betType == "ou") & (totalhandicap == "2.5")]
pdf_3 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
pdf = na.omit(pdf_1[pdf_2, on = "matchId"][pdf_3, on = "matchId"])
all_df = na.omit(pdf[mm[, c("matchId", "is_over")], on = "matchId"])
all_df = all_df[, `:=`(is_over, as.character(is_over))]
scaled_df = scale(all_df[, 3:9])
dist_man = dist(scaled_df, method = "manhattan")
mds_man2 = data.table(cmdscale(dist_man, eig = TRUE, k = 2)$points)
mds_man2[, `:=`(is_over, all_df$is_over)]
dist_euc = dist(scaled_df, method = "euclidian")
mds_euc2 = data.table(cmdscale(dist_euc, eig = TRUE, k = 2)$points)
mds_euc2[, `:=`(is_over, all_df$is_over)]
p1 = qplot(data = mds_euc2, V1, V2, color = is_over) + ggtitle(paste("MDS Euclidian and is_over results for",
bookmakers[n_bm], sep = " "))
p2 = qplot(data = mds_man2, V1, V2, color = is_over) + ggtitle(paste("MDS Manhattan and is_over results for",
bookmakers[n_bm], sep = " "))
grid.arrange(p1, p2)
}
func_1bman(1)
func_1bman(2)
func_1bman(3)
func_1bman(4)
func_1bman(5)
Again for MDS it is really hard to say that classes are separated well. Graphs for manhattan and euclidian distances are different indeed due to one of them measures the path while the other calculates shortest path in R^n space. Thats why with manhattan distance makes distance matrix bigger and observations distributed the space in a larger scale like in graph of bet365.
Even though we cant observe a clear distinction for MDS and PCA, most probably MDS is a better choice for this data due to handling nonlinearity. When we compare plots on 1a and 1b for each bookmaker plots seem more convex for MDS which explains nonlinearity.
Feature engineering for this part:
bookmakers = c("1xBet", "bet365", "Betfair Exchange", "Pinnacle",
"10Bet")
func_2a = function(n_bm) {
df = oo[bookmaker == bookmakers[n_bm]]
pdf_1 = dcast(df[(betType != "ou") & (betType != "ah")],
matchId + bookmaker ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
x = df[(betType == "ah") & (totalhandicap == "0")]
pdf_2 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
# only choose ou = 2.5 due to different handicap means
# different odds, so its not useful to mix different
# handicaps
x = df[(betType == "ou") & (totalhandicap == "2.5")]
pdf_3 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
pdf = na.omit(pdf_1[pdf_2, on = "matchId"][pdf_3, on = "matchId"])
mm[, `:=`(is_1x2, ifelse(home_goal > away_goal, "1", ifelse(home_goal ==
away_goal, "x", "2")))]
all_1x2 = na.omit(pdf[mm[, c("matchId", "is_1x2")], on = "matchId"])
scaled_df = scale(all_1x2[, 3:9])
pca = prcomp(scaled_df, center = TRUE, scale. = TRUE)
print(summary(pca))
eigs = pca$sdev^2
exp_var_ratio = eigs/sum(eigs)
cum_exp_var_ratio = cumsum(exp_var_ratio)
plot(cum_exp_var_ratio, type = "l", xlab = "# of Principle Components",
ylab = "Cumulative Explained Variance")
title(paste("1x2 Results - Cumulative Explained Variance Ratio of PCA for ",
bookmakers[n_bm], sep = ""))
all_pca = predict(pca, newdata = scaled_df)
all_pca2d = all_pca[, 1:2]
all_pca2d = data.table(all_pca2d)
all_pca2d[, `:=`(is_1x2, all_1x2$is_1x2)]
ggplot(all_pca2d, aes(x = PC1, y = PC2, color = is_1x2)) +
geom_point() + ggtitle(paste("Transformed Data with p = 2 PCA and is_1x2 results",
bookmakers[n_bm], sep = " "))
}
func_2a(1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8620 1.4220 1.1528 0.31994 0.25363 0.09678
## Proportion of Variance 0.4953 0.2889 0.1898 0.01462 0.00919 0.00134
## Cumulative Proportion 0.4953 0.7842 0.9740 0.98863 0.99782 0.99916
## PC7
## Standard deviation 0.07657
## Proportion of Variance 0.00084
## Cumulative Proportion 1.00000
func_2a(2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7096 1.5209 1.2876 0.21767 0.20100 0.12273
## Proportion of Variance 0.4175 0.3304 0.2369 0.00677 0.00577 0.00215
## Cumulative Proportion 0.4175 0.7480 0.9848 0.99161 0.99738 0.99953
## PC7
## Standard deviation 0.05739
## Proportion of Variance 0.00047
## Cumulative Proportion 1.00000
func_2a(3)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7306 1.3078 1.2174 0.65847 0.53327 0.27811
## Proportion of Variance 0.4279 0.2443 0.2117 0.06194 0.04063 0.01105
## Cumulative Proportion 0.4279 0.6722 0.8839 0.94584 0.98647 0.99752
## PC7
## Standard deviation 0.13175
## Proportion of Variance 0.00248
## Cumulative Proportion 1.00000
func_2a(4)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8741 1.6091 0.8743 0.28487 0.16779 0.12440
## Proportion of Variance 0.5018 0.3699 0.1092 0.01159 0.00402 0.00221
## Cumulative Proportion 0.5018 0.8717 0.9809 0.99246 0.99648 0.99869
## PC7
## Standard deviation 0.09562
## Proportion of Variance 0.00131
## Cumulative Proportion 1.00000
func_2a(5)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.9234 1.5186 0.71947 0.61145 0.23300 0.17029
## Proportion of Variance 0.5285 0.3295 0.07395 0.05341 0.00776 0.00414
## Cumulative Proportion 0.5285 0.8579 0.93188 0.98529 0.99305 0.99719
## PC7
## Standard deviation 0.14018
## Proportion of Variance 0.00281
## Cumulative Proportion 1.00000
Using first 2 components is easy to visualize however in terms of explaining variance of the data its not beneficial. For 1x2 case we can clearly see that there is a pattern when classifying 1x2 result however we for some betmakers such as 10Bet this pattern cannot be observed. With that intuition we need to do deeper research like increasing dimension at PCA or using MDS.
bookmakers = c("1xBet", "bet365", "Betfair Exchange", "Pinnacle",
"10Bet")
func_2bman = function(n_bm) {
df = oo[bookmaker == bookmakers[n_bm]]
pdf_1 = dcast(df[(betType != "ou") & (betType != "ah")],
matchId + bookmaker ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
x = df[(betType == "ah") & (totalhandicap == "0")]
pdf_2 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
# only choose ou = 2.5 due to different handicap means
# different odds, so its not useful to mix different
# handicaps
x = df[(betType == "ou") & (totalhandicap == "2.5")]
pdf_3 = dcast(x, matchId ~ betType + oddtype, value.var = c("odd"),
fun.aggregate = mean)
pdf = na.omit(pdf_1[pdf_2, on = "matchId"][pdf_3, on = "matchId"])
mm[, `:=`(is_1x2, ifelse(home_goal > away_goal, "1", ifelse(home_goal ==
away_goal, "x", "2")))]
all_1x2 = na.omit(pdf[mm[, c("matchId", "is_1x2")], on = "matchId"])
all_1x2 = all_1x2[, `:=`(is_1x2, as.character(is_1x2))]
scaled_df = scale(all_1x2[, 3:9])
dist_man = dist(scaled_df, method = "manhattan")
mds_man2 = data.table(cmdscale(dist_man, eig = TRUE, k = 2)$points)
mds_man2[, `:=`(is_1x2, all_1x2$is_1x2)]
dist_euc = dist(scaled_df, method = "euclidian")
mds_euc2 = data.table(cmdscale(dist_euc, eig = TRUE, k = 2)$points)
mds_euc2[, `:=`(is_1x2, all_1x2$is_1x2)]
p1 = qplot(data = mds_euc2, V1, V2, color = is_1x2) + ggtitle(paste("MDS Euclidian and is_1x2 results for",
bookmakers[n_bm], sep = " "))
p2 = qplot(data = mds_man2, V1, V2, color = is_1x2) + ggtitle(paste("MDS Manhattan and is_1x2 results for",
bookmakers[n_bm], sep = " "))
grid.arrange(p1, p2)
}
func_2bman(1)
func_2bman(2)
func_2bman(3)
func_2bman(4)
func_2bman(5)
At MDS apporach, like PCA we can again see pattern between 1 and 2 classes separation however for 10Bet its not clear again. So MDS didnt help us. We need to change our dimension to see whether there is a pattern or not. For distance metric we can observe separation mostly in Manhattan. So manhattan is a better choice for this model.
Comparing MDS to PCA, MDS made better dimension reduction for this model as we can see from graphs (1xBet, bet365). This is caused by expanding initial data to higher dimensional distance metric gives us extra information about the data so that explained variance. Except 10Bet, we can see patterns in graphs that can give us an insight when classifying the 1x2 outcome of a match with unsupervised approach.
3.Image Analysis
img = readJPEG("/Users/ilkerkurtulus/Downloads/IMG_7955.jpg")
str(img)
## num [1:512, 1:512, 1:3] 0.424 0.4 0.388 0.404 0.4 ...
dim(img)
## [1] 512 512 3
if (exists("rasterImage")) {
plot(1:2, type = "n")
rasterImage(img, 1, 1, 2, 2)
}
if (exists("rasterImage")) {
plot(1:4, type = "n")
rasterImage(img[, , 1], 1, 1, 2, 2)
rasterImage(img[, , 2], 2, 1, 3, 2)
rasterImage(img[, , 3], 3, 1, 4, 2)
}
noise = runif(512 * 512 * 3, min = 0, max = 1)
dim(noise) = c(512, 512, 3)
nimg = noise + img
nimg_scaled = (nimg - min(nimg))/(max(nimg) - min(nimg))
if (exists("rasterImage")) {
plot(1:2, type = "n")
rasterImage(nimg_scaled, 1, 1, 2, 2)
}
if (exists("rasterImage")) {
plot(1:4, type = "n")
rasterImage(nimg_scaled[, , 1], 1, 1, 2, 2)
rasterImage(nimg_scaled[, , 2], 2, 1, 3, 2)
rasterImage(nimg_scaled[, , 3], 3, 1, 4, 2)
}
to_greyscale = function(img) {
img[, , 1] * 0.21 + img[, , 2] * 0.72 + img[, , 3] * 0.07
}
grey_nimg = to_greyscale(nimg_scaled)
pca_img = prcomp(grey_nimg, center = TRUE, scale. = TRUE)
eigs = pca_img$sdev^2
exp_var_ratio = eigs/sum(eigs)
cum_exp_var_ratio = cumsum(exp_var_ratio)
plot(cum_exp_var_ratio)
Even the image has 512 dimensions we can explain it well with 250 components with 0.95 confidence interval. However its still very large dimension for statistical inference. b) Construct patch matrix:
extract_path = function(data, i, j, n) {
dw = (n - 1)/2
as.vector(t(data[(i - dw):(i + dw), (j - dw):(j + dw)]))
}
tmp = c()
for (i in 2:511) {
for (j in 2:511) {
x = extract_path(grey_nimg, i, j, 3)
tmp = c(tmp, x)
}
}
res = matrix(tmp, nrow = 260100, byrow = TRUE)
Apply pca and plot:
pca_res = prcomp(res, center = TRUE, scale. = TRUE)
# 1st component
pca_res1d = pca_res$x[, 1]
pca_mat = matrix(pca_res1d, nrow = 510, ncol = 510, byrow = TRUE)
pca_mat_scaled = (pca_mat - min(pca_mat))/(max(pca_mat) - min(pca_mat))
if (exists("rasterImage")) {
plot(1:2, type = "n")
rasterImage(pca_mat_scaled, 1, 1, 2, 2)
}
# 2nd component
pca_res1d = pca_res$x[, 2]
pca_mat = matrix(pca_res1d, nrow = 510, ncol = 510, byrow = TRUE)
pca_mat_scaled = (pca_mat - min(pca_mat))/(max(pca_mat) - min(pca_mat))
if (exists("rasterImage")) {
plot(1:2, type = "n")
rasterImage(pca_mat_scaled, 1, 1, 2, 2)
}
# 3rd component
pca_res1d = pca_res$x[, 3]
pca_mat = matrix(pca_res1d, nrow = 510, ncol = 510, byrow = TRUE)
pca_mat_scaled = (pca_mat - min(pca_mat))/(max(pca_mat) - min(pca_mat))
if (exists("rasterImage")) {
plot(1:2, type = "n")
rasterImage(pca_mat_scaled, 1, 1, 2, 2)
}
ev1 = pca_res$rotation[, 1]
ev_scaled1 = (ev1 - min(ev1))/(max(ev1) - min(ev1))
mat_ev1 = matrix(ev_scaled1, nrow = 3, ncol = 3, byrow = TRUE)
ev2 = pca_res$rotation[, 2]
ev_scaled2 = (ev2 - min(ev2))/(max(ev2) - min(ev2))
mat_ev2 = matrix(ev_scaled2, nrow = 3, ncol = 3, byrow = TRUE)
ev3 = pca_res$rotation[, 3]
ev_scaled3 = (ev3 - min(ev3))/(max(ev3) - min(ev3))
mat_ev3 = matrix(ev_scaled3, nrow = 3, ncol = 3, byrow = TRUE)
if (exists("rasterImage")) {
plot(1:4, type = "n")
rasterImage(mat_ev1, 1, 1, 2, 2)
rasterImage(mat_ev2, 2, 1, 3, 2)
rasterImage(mat_ev3, 3, 1, 4, 2)
}